## Word Segmentation with Conditional Random Fields
#  Sean Welleck | 2014
#
# Runs the entire CRF word segmentation process.
# Given an input training file, and an unsegmented testing file,
# produces a segmented testing file using a trained model.
# Also scores the segmented file and outputs a score to STDOUT.

import crf_formatter
import features
import sys
import optparse
import os
from subprocess import call, Popen, PIPE
import tempfile

# Runs the training, testing, and evaluation.
# Takes several filenames as input:
#   trainset - segmented training set
#   testset  - unsegmented testing set
#   predfile - desired name for predicted output file
#   model    - desired filename for trained model
#   dictionary - dictionary file (used by scoring script)
#   targets  - segmented testing set
def run(trainset, testset, predfile, model, dictionary, targets):
	"""Run the full pipeline: train, then test, then evaluate.

	Arguments are filenames:
	  trainset   - segmented training set, passed to train()
	  testset    - unsegmented testing set, passed to test()
	  predfile   - where test() writes predictions; also read by evaluate()
	  model      - model file written by train() and read by test()
	  dictionary - dictionary file handed to evaluate()
	  targets    - segmented (truth) testing set handed to evaluate()
	"""
	# The three stages are strictly sequential: the model must exist before
	# tagging, and the prediction file must exist before scoring.
	train(model=model, trainset=trainset)
	test(model=model, testset=testset, outfile=predfile)
	evaluate(dictionary=dictionary, targets=targets, predictions=predfile)

# Converts a segmented training set to a tagged format and to features,
# then trains a model using CRFSuite.
#   model    - desired filename for trained model
#   trainset - segmented training set
def train(model, trainset):
	"""Train a CRF model from a segmented training set.

	The training set is tagged with crf_formatter.text_to_tagged, features
	are generated with features.generate, and the external command
	`crfsuite learn -m <model> <features>` is run on the result.

	  model    - desired filename for trained model
	  trainset - segmented training set

	The two intermediate temp files are removed even if a step raises.
	A non-zero crfsuite exit status is reported on stderr but is not raised.
	"""
	sys.stderr.write("Training.\n")
	temp_paths = []
	try:
		# Only the paths are needed (the helpers and crfsuite open the files
		# themselves), so close each handle right away instead of leaking it.
		for _ in range(2):
			handle = tempfile.NamedTemporaryFile(delete=False)
			handle.close()
			temp_paths.append(handle.name)
		tagged_path, features_path = temp_paths

		# tag the segmented training set
		crf_formatter.text_to_tagged(trainset, tagged_path)
		# generate features for the tagged set
		features.generate(tagged_path, features_path)

		# Call CRFSuite using the command line to train a CRF model.
		cmd = ["crfsuite",  "learn", "-m", model, features_path]
		status = call(cmd)
		if status != 0:
			# Surface the failure instead of silently continuing.
			sys.stderr.write("crfsuite learn exited with status %d.\n" % status)
	finally:
		# clear the temp files, whether or not training succeeded
		for path in temp_paths:
			os.unlink(path)

# Convert an unsegmented testing set to a tagged format and to features,
# use a trained model to tag the testing set, then convert back to a segmented
# format.
#   model 	- filename for trained model
#   testset - unsegmented testing set
#   outfile - desired file name for the segmented predictions
def test(model, testset, outfile):
	"""Segment an unsegmented testing set with a trained model.

	The testing set is tagged with crf_formatter.text_to_tagged, features
	are generated with features.generate, `crfsuite tag -m <model> <features>`
	is run, and its stdout (split into lines) is handed to
	crf_formatter.predictions_to_text together with the tagged file.

	  model   - filename for trained model
	  testset - unsegmented testing set
	  outfile - desired file name for the segmented predictions

	The two intermediate temp files are removed even if a step raises.
	"""
	sys.stderr.write("Testing.\n")
	temp_paths = []
	try:
		# Only the paths are needed (the helpers and crfsuite open the files
		# themselves), so close each handle right away instead of leaking it.
		for _ in range(2):
			handle = tempfile.NamedTemporaryFile(delete=False)
			handle.close()
			temp_paths.append(handle.name)
		tagged_path, features_path = temp_paths

		# tag the testing file with placeholder tags
		crf_formatter.text_to_tagged(testset, tagged_path)
		# generate features for the tagged file
		features.generate(tagged_path, features_path)

		# Generate predicted tags using the trained model
		sys.stderr.write("Predicting tags.\n")

		# Call CRFSuite using the command line to tag the testing features
		# using the trained CRF model.
		cmd = ["crfsuite",  "tag", "-m", model, features_path]
		predictions, _ = Popen(cmd, stdout=PIPE).communicate()
		# On Python 3 communicate() returns bytes, and bytes.split('\n')
		# raises TypeError. On Python 2 the output is already a str, so this
		# branch is skipped and behavior is unchanged.
		if not isinstance(predictions, str):
			predictions = predictions.decode("utf-8")

		# pair the testing characters with their predicted tags
		# convert the tagged file back to lines of text
		crf_formatter.predictions_to_text(tagged_path, predictions.split('\n'), outfile=outfile)
	finally:
		# clear the temp files, whether or not tagging succeeded
		for path in temp_paths:
			os.unlink(path)

# Execute the score script using a dictionary, the target segmented test set,
# and the predicted test set generated by test().
def evaluate(dictionary, targets, predictions):
	"""Run the ./score script and print its output to STDOUT.

	  dictionary  - dictionary file passed as the script's first argument
	  targets     - segmented (truth) test set, second argument
	  predictions - predicted test set generated by test(), third argument

	If ./score is not a file in the current working directory, a message is
	written to stderr and nothing is scored.
	"""
	sys.stderr.write("Evaluating predictions.\n")
	if not os.path.isfile("./score"):
		sys.stderr.write("score script not detected.")
		return
	cmd = ["./score",  dictionary, targets, predictions]
	score, _ = Popen(cmd, stdout=PIPE).communicate()
	# On Python 3 communicate() returns bytes, which print() would show as a
	# b'...' repr. Decode so the score text itself is printed. On Python 2
	# the output is already a str and is left untouched.
	if not isinstance(score, str):
		score = score.decode("utf-8", "replace")
	print(score)

# Parse command-line arguments and run.
if __name__ == '__main__':
	# One row per command-line option:
	# (short flag, long flag, destination attribute, default, help text)
	cli_options = (
		("-t", "--training_filename", "trainset",
			"data/training_seg.utf8", "Training data filename."),
		("-s", "--testing_filename", "testset",
			"data/test_unseg.utf8", "Unsegmented Testing data filename."),
		("-p", "--predictions_filename", "predfile",
			"output/predictions.utf8", "Desired filename for test set predictions."),
		("-m", "--model_filename", "model",
			"models/crf.model", "Desired filename for model."),
		("-d", "--dictionary", "dictionary",
			"data/dict.utf8", "Dictionary used for testing."),
		("-g", "--targets_filename", "targets",
			"data/test_seg.utf8", "Target (truth) segmented test set."),
	)

	parser = optparse.OptionParser()
	for short_flag, long_flag, dest_name, default_value, help_text in cli_options:
		parser.add_option(short_flag, long_flag, dest=dest_name,
			default=default_value, help=help_text)

	# Leftover positional arguments are parsed but not used.
	options, _positional = parser.parse_args()

	run(trainset=options.trainset, testset=options.testset,
		predfile=options.predfile, model=options.model,
		dictionary=options.dictionary, targets=options.targets)
	